library(survival)
library(tidyverse)
library(gtsummary)
library(purrr)
library(zoo) # For na.locf (last observation carried forward)
library(MASS)
library(ncvreg)
library(devtools)
library(Cyclops)
library(BrokenAdaptiveRidge)
library(flextable)
library(gt)
library(PAmeasures)
library(pec)
library(survMisc)
data.path <- "/Users/jacenai/Desktop/BIOS_215/Project/AIDS/"
aids <- read.csv(file.path(data.path, "AIDS.data.csv")) %>%
mutate(trt = ifelse(trt %in% c(0, 3), 0, 1)) %>%
dplyr::select(-pidnum, -hemo, -zprior, -cd40, -cd420, -treat)
aids.table <- read.csv(file.path(data.path, "AIDS.data.csv")) %>%
mutate(trt = ifelse(trt %in% c(0, 3), 0, 1)) %>%
dplyr::select(-pidnum, -hemo, -zprior, -cd420) %>%
dplyr::select(-time, -cid) %>%
dplyr::mutate(
# cid = factor(cid, levels = c(0, 1), labels = c("Censored", "Failure")),
trt = factor(trt, levels = c(0, 1), labels = c("Monotherapy", "Combination Therapy")),
homo = factor(homo, levels = c(0, 1), labels = c("No.homosexual.activity", "homosexual.activity")),
drugs = factor(drugs, levels = c(0, 1), labels = c("No.history.IV.drug.use", "history.IV.drug.use")),
oprior = factor(oprior, levels = c(0, 1), labels = c("ZDV antiretroviral therapy pre-175", "Non-ZDV antiretroviral therapy pre-175")),
z30 = factor(z30, levels = c(0 ,1), labels = c("No ZDV in the 30 days prior to 175", "Had ZDV in the 30 days prior to 175")),
race = factor(race, levels = c(0, 1), labels = c("white", "non-white")),
gender = factor(gender, levels = c(0, 1), labels = c("Female", "Male")),
str2 = factor(str2, levels = c(0, 1), labels = c("no antiretroviral history", "had antiretroviral history")),
symptom = factor(symptom, levels = c(0, 1), labels = c("asymptomatic", "symptomatic")),
# treat = factor(treat, levels = c(0, 1), labels = c("ZDV only", "others")),
offtrt = factor(offtrt, levels = c(0, 1), labels = c("no off-trt before 96±5 weeks ", "off-trt before 96±5 weeks")),
karnof = factor(karnof, levels = c(70, 80, 90, 100), labels = c("70", "80", "90", "100")),
strat = factor(strat, levels = c(1, 2, 3) , labels = c("Antiretroviral Naive", "> 1 but ≤ 52 weeks of prior antiretroviral therapy", "> 52 weeks"))
)
# Separate continuous and categorical variables
continuous_vars <- aids.table %>% dplyr::select(where(is.numeric)) # Select only numeric (continuous) columns
categorical_vars <- aids.table %>% dplyr::select(where(is.factor)) # Select only categorical variables
# Recombine them in the desired order: Continuous first, then categorical
aids_reordered <- bind_cols(continuous_vars, categorical_vars)
# Run tbl_summary on the reordered dataset
aids_reordered %>%
tbl_summary(
by = trt,
statistic = list(
all_continuous() ~ "{mean} ± {sd}",
all_categorical() ~ "{n} ({p}%)"
),
digits = list(
all_continuous() ~ 1, # Round continuous values to 2 decimals
all_categorical() ~ c(0, 0) # Round count to integer, percentage to 1 decimal
),
missing = "no" # Exclude missing category
) %>%
bold_labels() %>% # Bold variable names
modify_header(label ~ "**Variables**") %>% # Change label column name to "Variable"
add_overall(
statistic = list(
all_continuous() ~ "{mean} ± {sd}",
all_categorical() ~ "{n} ({p}%)"
),
digits = list(
all_continuous() ~ 1,
all_categorical() ~ c(0, 0)
),
) %>%
# modify_table_body( fun = ~ .x %>% arrange(variable) ) %>% # arrange variables alphabetically
modify_table_body(
fun = ~ .x %>% arrange(match(variable, names(aids_reordered)))
) %>%
modify_header(label ~ "**Variable**") %>%
modify_spanning_header(c("stat_1", "stat_2") ~ "**Treatment**")
| Variable | Overall, N = 2,1391 | Treatment | |
|---|---|---|---|
| Monotherapy, N = 1,0931 | Combination Therapy, N = 1,0461 | ||
| age | 35.2 ± 8.7 | 35.2 ± 8.7 | 35.3 ± 8.7 |
| wtkg | 75.1 ± 13.3 | 75.4 ± 13.1 | 74.8 ± 13.4 |
| preanti | 379.2 ± 468.7 | 379.3 ± 470.6 | 379.0 ± 466.8 |
| treat | 1,607 (75%) | 561 (51%) | 1,046 (100%) |
| cd40 | 350.5 ± 118.6 | 350.3 ± 114.2 | 350.8 ± 123.0 |
| cd80 | 986.6 ± 480.2 | 979.4 ± 489.3 | 994.2 ± 470.6 |
| cd820 | 935.4 ± 445.0 | 936.0 ± 465.3 | 934.7 ± 422.9 |
| homo | |||
| No.homosexual.activity | 725 (34%) | 373 (34%) | 352 (34%) |
| homosexual.activity | 1,414 (66%) | 720 (66%) | 694 (66%) |
| drugs | |||
| No.history.IV.drug.use | 1,858 (87%) | 961 (88%) | 897 (86%) |
| history.IV.drug.use | 281 (13%) | 132 (12%) | 149 (14%) |
| karnof | |||
| 70 | 9 (0%) | 6 (1%) | 3 (0%) |
| 80 | 80 (4%) | 40 (4%) | 40 (4%) |
| 90 | 787 (37%) | 418 (38%) | 369 (35%) |
| 100 | 1,263 (59%) | 629 (58%) | 634 (61%) |
| oprior | |||
| ZDV antiretroviral therapy pre-175 | 2,092 (98%) | 1,068 (98%) | 1,024 (98%) |
| Non-ZDV antiretroviral therapy pre-175 | 47 (2%) | 25 (2%) | 22 (2%) |
| z30 | |||
| No ZDV in the 30 days prior to 175 | 962 (45%) | 498 (46%) | 464 (44%) |
| Had ZDV in the 30 days prior to 175 | 1,177 (55%) | 595 (54%) | 582 (56%) |
| race | |||
| white | 1,522 (71%) | 764 (70%) | 758 (72%) |
| non-white | 617 (29%) | 329 (30%) | 288 (28%) |
| gender | |||
| Female | 368 (17%) | 191 (17%) | 177 (17%) |
| Male | 1,771 (83%) | 902 (83%) | 869 (83%) |
| str2 | |||
| no antiretroviral history | 886 (41%) | 461 (42%) | 425 (41%) |
| had antiretroviral history | 1,253 (59%) | 632 (58%) | 621 (59%) |
| strat | |||
| Antiretroviral Naive | 886 (41%) | 461 (42%) | 425 (41%) |
| > 1 but ≤ 52 weeks of prior antiretroviral therapy | 410 (19%) | 198 (18%) | 212 (20%) |
| > 52 weeks | 843 (39%) | 434 (40%) | 409 (39%) |
| symptom | |||
| asymptomatic | 1,769 (83%) | 908 (83%) | 861 (82%) |
| symptomatic | 370 (17%) | 185 (17%) | 185 (18%) |
| offtrt | |||
| no off-trt before 96±5 weeks | 1,363 (64%) | 693 (63%) | 670 (64%) |
| off-trt before 96±5 weeks | 776 (36%) | 400 (37%) | 376 (36%) |
| 1 Mean ± SD; n (%) | |||
# add_p() # Add p-values
Group by treatment and plot the Kaplan-Meier curves.
Based on the survival curves, the effect of combination therapy becomes more beneficial in the long term.
data_directory <- "https://raw.githubusercontent.com/erickawaguchi/BIOSm215/master/functions/" #For later use
library(km.ci)
## Warning: package 'km.ci' was built under R version 4.3.3
source(paste0(data_directory, "conf_bands.R"))
# KME of S(t) for treatment groups
aids.surv <- survfit(Surv(time, cid) ~ trt, data = aids)
plot(aids.surv, lty = 1:2, col = c('blue', 'red'), main = ' Est. Survival Function for Treatment Groups', ylab = 'Estimates Survival Function', xlab = 'Time', conf.int = FALSE)
legend('bottomleft', c('Monotherapy', 'Combination'), col = c('blue', 'red'), lty = 1:2, bty = 'n')
Group by ZDV antiretroviral therapy pre-175 (oprior) and
plot the Kaplan-Meier curves.
plot(survfit(Surv(time, cid) ~ (oprior), data = aids, conf.type = 'none'), lty = 1:2, col = c('blue', 'red'), main = ' Est. Survival Function for Non-ZDV antiretroviral therapy pre-175', ylab = 'Estimates Survival Function', xlab = 'Time')
legend('bottomleft', c('No', 'Yes'), col = c('blue', 'red'), lty = 1:2, bty = 'n')
Group by ZDV in the 30 days prior to 175 (z30) and plot
the Kaplan-Meier curves.
plot(survfit(Surv(time, cid) ~ (z30), data = aids, conf.type = 'none'), lty = 1:2, col = c('blue', 'red'), main = ' Est. Survival Function for ZDV prior to 175', ylab = 'Estimates Survival Function', xlab = 'Time')
legend('bottomleft', c('No', 'Yes'), col = c('blue', 'red'), lty = 1:2, bty = 'n')
Group by gender and plot the Kaplan-Meier curves.
plot(survfit(Surv(time, cid) ~ gender, data = aids, conf.type = 'none'), lty = 1:2, col = c('blue', 'red'), main = ' Est. Survival Function for Gender Groups', ylab = 'Estimates Survival Function', xlab = 'Time')
legend('bottomleft', c('Female', 'Male'), col = c('blue', 'red'), lty = 1:2, bty = 'n')
Group by homosexual activity (homo) and plot the
Kaplan-Meier curves.
plot(survfit(Surv(time, cid) ~ homo, data = aids, conf.type = 'none'), lty = 1:2, col = c('blue', 'red'), main = ' Est. Survival Function for Two Sexual Orientation Groups', ylab = 'Estimates Survival Function', xlab = 'Time')
legend('bottomleft', c('No homosexual activity', 'has homosexual activity'), col = c('blue', 'red'), lty = 1:2, bty = 'n')
Group by interaction between race and
gender and plot the Kaplan-Meier curves.
aids.1 = aids %>%
mutate(demo = case_when(
gender == 0 & race == 0 ~ 1,
gender == 0 & race == 1 ~ 2,
gender == 1 & race == 0 ~ 3,
gender == 1 & race == 1 ~ 4
))
plot(survfit(Surv(time, cid) ~ demo, data = aids.1, conf.type = 'none'), lty = 1:4, col = c('blue', 'red','orange', 'green'), main = ' Est. Survival Function for Different Demographic Groups', ylab = 'Estimates Survival Function', xlab = 'Time')
legend('bottomleft', c('White Female', 'Non-white Female', "White Male", "Non-white Male"), col = c('blue', 'red','orange', 'green'), lty = 1:4, bty = 'n')
# Log-Rank Test (W = 1)
survdiff(Surv(time, cid) ~ trt, data = aids, rho = 0)
## Call:
## survdiff(formula = Surv(time, cid) ~ trt, data = aids, rho = 0)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 1093 309 255 11.4 22.4
## trt=1 1046 212 266 11.0 22.4
##
## Chisq= 22.4 on 1 degrees of freedom, p= 2e-06
#Log-Rank Test with Peto-Peto Weight
survdiff(Surv(time, cid) ~ trt, data = aids, rho = 1)
## Call:
## survdiff(formula = Surv(time, cid) ~ trt, data = aids, rho = 1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 1093 272 223 10.9 24.5
## trt=1 1046 182 231 10.5 24.5
##
## Chisq= 24.5 on 1 degrees of freedom, p= 8e-07
fit <- ten(Surv(time, cid) ~ trt, data = aids) #Save as a "ten" object
comp(fit, p = c(1, 0.5, 0), q = c(0, 0.5, 1)) #Calculates log-rank test statistics
## Q Var Z pNorm
## 1 -5.4019e+01 1.3009e+02 -4.7362 3
## n -1.0111e+05 3.8641e+08 -5.1434 4
## sqrtN -2.3378e+03 2.2096e+05 -4.9734 5
## S1 -4.9288e+01 9.9391e+01 -4.9439 8
## S2 -4.9262e+01 9.9273e+01 -4.9442 7
## FH_p=1_q=0 -4.9349e+01 9.9618e+01 -4.9443 6
## FH_p=0.5_q=0.5 -1.3704e+01 1.3781e+01 -3.6916 1
## FH_p=0_q=1 -4.6703e+00 2.9092e+00 -2.7381 2
## maxAbsZ Var Q pSupBr
## 1 5.7459e+01 1.3009e+02 5.0377 8
## n 1.0451e+05 3.8641e+08 5.3164 2
## sqrtN 2.4446e+03 2.2096e+05 5.2005 3
## S1 5.1823e+01 9.9391e+01 5.1982 6
## S2 5.1795e+01 9.9273e+01 5.1984 5
## FH_p=1_q=0 5.1887e+01 9.9618e+01 5.1986 4
## FH_p=0.5_q=0.5 1.5222e+01 1.3781e+01 4.1005 7
## FH_p=0_q=1 5.5820e+00 2.9092e+00 3.2727 1
fit <- basehaz(coxph(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
strata(factor(trt)),
data = aids, ties = 'breslow'), centered = TRUE)
# Create individual data frames
mono.treatment <- data.frame(
"time" = fit$time[fit$strata == 0],
"H1" = fit$hazard[fit$strata == 0])
combined.treatment <- data.frame(
"time" = fit$time[fit$strata == 1],
"H2" = fit$hazard[fit$strata == 1])
# List of data frames
df_list <- list(mono.treatment, combined.treatment)
# Merge all data frames by 'time'
merged_df <- reduce(df_list, full_join, by = "time") %>%
arrange(time) %>%
na.locf(na.rm = FALSE) # Fill missing values
# # Create the plot
# plot(log(merged_df$H1) ~ merged_df$time, type = 's',
# ylab = 'Log Cumulative Hazard', xlab = 'Time',
# main = 'Log H(t) vs. Time', col = "red", lty = 1, ylim = c(-3.5, 1))
#
# # Add lines for other groups
# lines(log(merged_df$H2) ~ merged_df$time, type = "s", lty = 1, col = "blue")
#
# # Add a legend
# legend("bottomright", c("mono treatment", "combined treatment"),
# lty = 1, col = c("red", "blue", "green", "purple"),
# bty = "n", cex = .9)
#Convert to long format for ggplot
long_df <- merged_df %>%
pivot_longer(cols = starts_with("H"), names_to = "Group", values_to = "Hazard")
# Replace group names for better legend readability
long_df$Group <- recode(long_df$Group,
"H1" = "Mono Treatment",
"H2" = "Combined Treatment")
# Plot using ggplot2
ggplot(long_df, aes(x = time, y = log(Hazard), color = Group)) +
geom_step(size = 1.2, linetype = "solid") + # Thicker lines for visibility
labs(title = NULL, # Remove title
x = "Time",
y = expression(log(H(t)))) + # Use math notation for log(H(t))
scale_color_manual(values = c("#D55E00", "#0072B2", "#009E73", "#CC79A7")) + # Scientific color palette
theme_classic(base_size = 16) + # Use a clean, high-impact journal theme
theme(
axis.title = element_text(face = "bold", size = 18), # Bold axis labels
axis.text = element_text(size = 16), # Large tick labels
legend.title = element_blank(), # No legend title
legend.position = "bottom", # Position legend for clarity
legend.text = element_text(size = 16), # Larger legend text
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank() # Remove minor grid lines
)
Baseline cumulative hazard plot
fit <- basehaz(coxph(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
strata(factor(trt)),
data = aids, ties = 'breslow'), centered = TRUE)
# Create individual data frames
mono.treatment <- data.frame(
"time" = fit$time[fit$strata == 0],
"H1" = fit$hazard[fit$strata == 0])
combined.treatment <- data.frame(
"time" = fit$time[fit$strata == 1],
"H2" = fit$hazard[fit$strata == 1])
# List of data frames
df_list <- list(mono.treatment, combined.treatment)
# Merge all data frames by 'time'
merged_df <- reduce(df_list, full_join, by = "time") %>%
arrange(time) %>%
na.locf(na.rm = FALSE) # Fill missing values
# # Create the plot
plot(log(merged_df$H1) ~ merged_df$time, type = 's',
ylab = 'Log Cumulative Hazard', xlab = 'Time',
main = 'Log H(t) vs. Time', col = "red", lty = 1, ylim = c(-8.5, 1))
# Add lines for other groups
lines(log(merged_df$H2) ~ merged_df$time, type = "s", lty = 1, col = "blue")
# Add a legend
legend("bottomright", c("mono treatment", "combined treatment"),
lty = 1, col = c("red", "blue", "green", "purple"),
bty = "n", cex = .9)
Check for the proportional hazards assumption by Anderson plot
reptime <- function(l, t, range_of_time){
x <- numeric(length(range_of_time))
for(i in seq(range_of_time)){
diff <- range_of_time[i] - t
diff <- diff[diff >= 0]
if(length(diff) > 0){
x[i] <- l[which.min(diff)]
}else{
x[i] <- 0
}
}
return(x)
}
fit3 <- basehaz(coxph(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
strata(factor(trt)),
data = aids, ties = 'breslow'), centered = F)
H1 <- fit3$hazard[fit3$strata == 0]
H2 <- fit3$hazard[fit3$strata == 1]
t1 <- fit3$time[fit3$strata == 0]
t2 <- fit3$time[fit3$strata == 1]
H1 <- reptime(H1, t1, 1:2000)
H2 <- reptime(H2, t2, 1:2000)
par(mfrow = c(1, 2)) # Set up side-by-side plots
# First Plot
plot(log(merged_df$H1) ~ merged_df$time, type = 's',
ylab = 'Log H(t)',
xlab = 'Time',
main = element_blank(),
col = "red", lty = 1, ylim = c(-8.5, 1))
# Add lines for other groups
lines(log(merged_df$H2) ~ merged_df$time, type = "s", lty = 1, col = "blue")
# Add a legend
legend("topleft", c("mono treatment", "combined treatment"),
lty = 1, col = c("red", "blue"),
bty = "n", cex = .9)
# Add label (a)
mtext("(a)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
# Second Plot
plot(H2 ~ H1, main = element_blank(),
ylab = 'combined treatment', xlab = 'mono treatment',
type = 's', xlim = c(0, 1), ylim = c(0, 1))
# Add 45-degree reference line
# Fit a regression model through the origin
model <- lm(H2 ~ H1 - 1) # '-1' removes the intercept
# Add the fitted regression line
abline(model, col = "red", lwd = 2, lty = 2) # Blue regression line
# Add label for the reference line
legend("topleft", legend = "Reference line",
col = "red", lty = 2, bty = "n")
# Add label (b)
mtext("(b)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
Check for the proportional hazards assumption using the Schoenfeld residuals test.
aids.fit <- coxph(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, ties = 'breslow')
summary(aids.fit)
## Call:
## coxph(formula = Surv(time, cid) ~ age + wtkg + factor(homo) +
## factor(drugs) + karnof + factor(oprior) + factor(z30) + preanti +
## factor(race) + factor(gender) + factor(str2) + strat + factor(symptom) +
## factor(offtrt) + cd80 + cd820 + factor(trt), data = aids,
## ties = "breslow")
##
## n= 2139, number of events= 521
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.0119096 1.0119809 0.0052362 2.274 0.02294 *
## wtkg 0.0019335 1.0019353 0.0034800 0.556 0.57849
## factor(homo)1 -0.0369980 0.9636781 0.1267317 -0.292 0.77033
## factor(drugs)1 -0.4261507 0.6530180 0.1509507 -2.823 0.00476 **
## karnof -0.0216934 0.9785402 0.0071612 -3.029 0.00245 **
## factor(oprior)1 0.1847325 1.2028966 0.2733062 0.676 0.49909
## factor(z30)1 0.3100522 1.3634962 0.2193701 1.413 0.15755
## preanti 0.0002707 1.0002707 0.0001562 1.733 0.08306 .
## factor(race)1 -0.0126148 0.9874644 0.1101237 -0.115 0.90880
## factor(gender)1 0.1052503 1.1109886 0.1632832 0.645 0.51919
## factor(str2)1 0.1122370 1.1187780 0.2993203 0.375 0.70768
## strat -0.0775970 0.9253373 0.1598124 -0.486 0.62729
## factor(symptom)1 0.5371378 1.7111023 0.1033624 5.197 2.03e-07 ***
## factor(offtrt)1 0.7671321 2.1535812 0.0911278 8.418 < 2e-16 ***
## cd80 0.0005666 1.0005667 0.0001335 4.244 2.20e-05 ***
## cd820 -0.0004446 0.9995555 0.0001535 -2.897 0.00377 **
## factor(trt)1 -0.4387301 0.6448548 0.0900759 -4.871 1.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0120 0.9882 1.0016 1.0224
## wtkg 1.0019 0.9981 0.9951 1.0088
## factor(homo)1 0.9637 1.0377 0.7517 1.2354
## factor(drugs)1 0.6530 1.5314 0.4858 0.8778
## karnof 0.9785 1.0219 0.9649 0.9924
## factor(oprior)1 1.2029 0.8313 0.7040 2.0553
## factor(z30)1 1.3635 0.7334 0.8870 2.0960
## preanti 1.0003 0.9997 1.0000 1.0006
## factor(race)1 0.9875 1.0127 0.7958 1.2253
## factor(gender)1 1.1110 0.9001 0.8067 1.5300
## factor(str2)1 1.1188 0.8938 0.6222 2.0115
## strat 0.9253 1.0807 0.6765 1.2657
## factor(symptom)1 1.7111 0.5844 1.3973 2.0954
## factor(offtrt)1 2.1536 0.4643 1.8013 2.5747
## cd80 1.0006 0.9994 1.0003 1.0008
## cd820 0.9996 1.0004 0.9993 0.9999
## factor(trt)1 0.6449 1.5507 0.5405 0.7694
##
## Concordance= 0.677 (se = 0.012 )
## Likelihood ratio test= 200.9 on 17 df, p=<2e-16
## Wald test = 212.3 on 17 df, p=<2e-16
## Score (logrank) test = 218.2 on 17 df, p=<2e-16
(schoenfeld_test <- cox.zph(aids.fit))
## chisq df p
## age 1.11449 1 0.2911
## wtkg 0.31819 1 0.5727
## factor(homo) 0.22532 1 0.6350
## factor(drugs) 0.46626 1 0.4947
## karnof 0.58677 1 0.4437
## factor(oprior) 0.49217 1 0.4830
## factor(z30) 0.01785 1 0.8937
## preanti 0.63321 1 0.4262
## factor(race) 0.00309 1 0.9557
## factor(gender) 0.00155 1 0.9686
## factor(str2) 0.20962 1 0.6471
## strat 0.13095 1 0.7174
## factor(symptom) 1.24924 1 0.2637
## factor(offtrt) 18.40673 1 1.8e-05
## cd80 2.11871 1 0.1455
## cd820 0.09158 1 0.7622
## factor(trt) 7.10497 1 0.0077
## GLOBAL 38.93975 17 0.0018
df = as.data.frame(round(schoenfeld_test$table, 2))
df$variable = c("age", "weight", "homo", "drugs", "karnof", "oprior", "z30", "preanti", "race", "gender", "str2", "strat", "symptom", "offtrt", "cd80", "cd820", "treatment", "GLOBAL")
df = data.frame("Variable" = df$variable, "Chisq" = df$chisq, "DF" = df$df, "p-value"= df$p)
ft_ph = flextable(df)
ft_ph = set_caption(ft_ph, caption = "Schoenfeld Residuals Test on PH Assumption")
ft_ph <- ft_ph %>%
bold(i = c(14, 17), bold = TRUE)
ft_ph
Variable | Chisq | DF | p.value |
|---|---|---|---|
age | 1.11 | 1 | 0.29 |
weight | 0.32 | 1 | 0.57 |
homo | 0.23 | 1 | 0.64 |
drugs | 0.47 | 1 | 0.49 |
karnof | 0.59 | 1 | 0.44 |
oprior | 0.49 | 1 | 0.48 |
z30 | 0.02 | 1 | 0.89 |
preanti | 0.63 | 1 | 0.43 |
race | 0.00 | 1 | 0.96 |
gender | 0.00 | 1 | 0.97 |
str2 | 0.21 | 1 | 0.65 |
strat | 0.13 | 1 | 0.72 |
symptom | 1.25 | 1 | 0.26 |
offtrt | 18.41 | 1 | 0.00 |
cd80 | 2.12 | 1 | 0.15 |
cd820 | 0.09 | 1 | 0.76 |
treatment | 7.10 | 1 | 0.01 |
GLOBAL | 38.94 | 17 | 0.00 |
Conclusion:
From the Schoenfeld Residuals Test, the variable treatment violates the PH assumption, so a Cox’s PH model is not appropriate.
par(mfrow = c(1, 2)) # Set up side-by-side plots
plot(schoenfeld_test)
# check the Schoenfeld residuals plots for treatment
plot(schoenfeld_test, var = 17)
abline(h = coef(aids.fit)[17], col = "red", lwd = 2)
df1 = as.data.frame(round(summary(aids.fit)$coef, 2))
df1$variable = c("age", "weight", "homo", "drugs", "karnof", "oprior", "z30", "preanti", "race", "gender", "str2", "strat", "symptom", "offtrt", "cd80", "cd820", "treatment")
df1 = df1[,c(6,1,2,3,4,5)]
ft_cox = flextable(df1)
ft_cox = set_caption(ft_cox, caption = "Cox's Model Coef Table")
ft_cox
variable | coef | exp(coef) | se(coef) | z | Pr(>|z|) |
|---|---|---|---|---|---|
age | 0.01 | 1.01 | 0.01 | 2.27 | 0.02 |
weight | 0.00 | 1.00 | 0.00 | 0.56 | 0.58 |
homo | -0.04 | 0.96 | 0.13 | -0.29 | 0.77 |
drugs | -0.43 | 0.65 | 0.15 | -2.82 | 0.00 |
karnof | -0.02 | 0.98 | 0.01 | -3.03 | 0.00 |
oprior | 0.18 | 1.20 | 0.27 | 0.68 | 0.50 |
z30 | 0.31 | 1.36 | 0.22 | 1.41 | 0.16 |
preanti | 0.00 | 1.00 | 0.00 | 1.73 | 0.08 |
race | -0.01 | 0.99 | 0.11 | -0.11 | 0.91 |
gender | 0.11 | 1.11 | 0.16 | 0.64 | 0.52 |
str2 | 0.11 | 1.12 | 0.30 | 0.37 | 0.71 |
strat | -0.08 | 0.93 | 0.16 | -0.49 | 0.63 |
symptom | 0.54 | 1.71 | 0.10 | 5.20 | 0.00 |
offtrt | 0.77 | 2.15 | 0.09 | 8.42 | 0.00 |
cd80 | 0.00 | 1.00 | 0.00 | 4.24 | 0.00 |
cd820 | 0.00 | 1.00 | 0.00 | -2.90 | 0.00 |
treatment | -0.44 | 0.64 | 0.09 | -4.87 | 0.00 |
Conclusions:
Sexual orientation and gender are non-significant predictors.
A hazard ratio of 0.6449 means that individuals receiving a combination therapy has a 35.5% lower hazard of failure compared to individuals receiving a mono-therapy. So a combination therapy is associated with better survival.
Stratified test of treatment effect while adjusting for gender
# Looking at females
survdiff(Surv(time, cid) ~ trt, data = aids, subset = (gender == 0))
## Call:
## survdiff(formula = Surv(time, cid) ~ trt, data = aids, subset = (gender ==
## 0))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 191 47 36.8 2.82 5.63
## trt=1 177 27 37.2 2.79 5.63
##
## Chisq= 5.6 on 1 degrees of freedom, p= 0.02
# Looking at males
survdiff(Surv(time, cid) ~ trt, data = aids, subset = (gender == 1))
## Call:
## survdiff(formula = Surv(time, cid) ~ trt, data = aids, subset = (gender ==
## 1))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 902 262 218 8.85 17.3
## trt=1 869 185 229 8.43 17.3
##
## Chisq= 17.3 on 1 degrees of freedom, p= 3e-05
# Stratified test
survdiff(Surv(time, cid) ~ trt + strata(gender), data = aids)
## Call:
## survdiff(formula = Surv(time, cid) ~ trt + strata(gender), data = aids)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 1093 309 255 11.5 22.5
## trt=1 1046 212 266 11.0 22.5
##
## Chisq= 22.5 on 1 degrees of freedom, p= 2e-06
Stratified test of treatment effect while adjusting for sexual orientation
# No homosexual activity
survdiff(Surv(time, cid) ~ trt, data = aids, subset = (homo == 0))
## Call:
## survdiff(formula = Surv(time, cid) ~ trt, data = aids, subset = (homo ==
## 0))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 373 94 74.4 5.14 10
## trt=1 352 59 78.6 4.87 10
##
## Chisq= 10 on 1 degrees of freedom, p= 0.002
# With homosexual activity
survdiff(Surv(time, cid) ~ trt, data = aids, subset = (homo == 1))
## Call:
## survdiff(formula = Surv(time, cid) ~ trt, data = aids, subset = (homo ==
## 1))
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 720 215 181 6.52 12.8
## trt=1 694 153 187 6.29 12.8
##
## Chisq= 12.8 on 1 degrees of freedom, p= 3e-04
# Stratified test
survdiff(Surv(time, cid) ~ trt + strata(homo), data = aids)
## Call:
## survdiff(formula = Surv(time, cid) ~ trt + strata(homo), data = aids)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## trt=0 1093 309 255 11.4 22.3
## trt=1 1046 212 266 10.9 22.3
##
## Chisq= 22.3 on 1 degrees of freedom, p= 2e-06
# Perform stepwise selection to find the best model according to AIC
best.model.aic <- stepAIC(aids.fit, direction="both",
k=2, trace=0)
summary(best.model.aic)
## Call:
## coxph(formula = Surv(time, cid) ~ age + factor(drugs) + karnof +
## factor(z30) + preanti + factor(symptom) + factor(offtrt) +
## cd80 + cd820 + factor(trt), data = aids, ties = "breslow")
##
## n= 2139, number of events= 521
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.0124647 1.0125427 0.0051020 2.443 0.01456 *
## factor(drugs)1 -0.4385999 0.6449387 0.1473238 -2.977 0.00291 **
## karnof -0.0221635 0.9780803 0.0070968 -3.123 0.00179 **
## factor(z30)1 0.3054998 1.3573032 0.1187689 2.572 0.01010 *
## preanti 0.0002320 1.0002320 0.0001126 2.060 0.03940 *
## factor(symptom)1 0.5362271 1.7095448 0.1016969 5.273 1.34e-07 ***
## factor(offtrt)1 0.7724633 2.1650930 0.0908059 8.507 < 2e-16 ***
## cd80 0.0005677 1.0005679 0.0001327 4.279 1.88e-05 ***
## cd820 -0.0004428 0.9995573 0.0001531 -2.891 0.00384 **
## factor(trt)1 -0.4383620 0.6450922 0.0898440 -4.879 1.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0125 0.9876 1.0025 1.0227
## factor(drugs)1 0.6449 1.5505 0.4832 0.8608
## karnof 0.9781 1.0224 0.9646 0.9918
## factor(z30)1 1.3573 0.7368 1.0754 1.7131
## preanti 1.0002 0.9998 1.0000 1.0005
## factor(symptom)1 1.7095 0.5850 1.4006 2.0866
## factor(offtrt)1 2.1651 0.4619 1.8121 2.5868
## cd80 1.0006 0.9994 1.0003 1.0008
## cd820 0.9996 1.0004 0.9993 0.9999
## factor(trt)1 0.6451 1.5502 0.5409 0.7693
##
## Concordance= 0.675 (se = 0.012 )
## Likelihood ratio test= 199.2 on 10 df, p=<2e-16
## Wald test = 210.5 on 10 df, p=<2e-16
## Score (logrank) test = 216.2 on 10 df, p=<2e-16
n <- nrow(aids)
best.model.bic <- stepAIC(aids.fit, direction="both",
k=log(n), trace=0)
best.model.aic
## Call:
## coxph(formula = Surv(time, cid) ~ age + factor(drugs) + karnof +
## factor(z30) + preanti + factor(symptom) + factor(offtrt) +
## cd80 + cd820 + factor(trt), data = aids, ties = "breslow")
##
## coef exp(coef) se(coef) z p
## age 0.0124647 1.0125427 0.0051020 2.443 0.01456
## factor(drugs)1 -0.4385999 0.6449387 0.1473238 -2.977 0.00291
## karnof -0.0221635 0.9780803 0.0070968 -3.123 0.00179
## factor(z30)1 0.3054998 1.3573032 0.1187689 2.572 0.01010
## preanti 0.0002320 1.0002320 0.0001126 2.060 0.03940
## factor(symptom)1 0.5362271 1.7095448 0.1016969 5.273 1.34e-07
## factor(offtrt)1 0.7724633 2.1650930 0.0908059 8.507 < 2e-16
## cd80 0.0005677 1.0005679 0.0001327 4.279 1.88e-05
## cd820 -0.0004428 0.9995573 0.0001531 -2.891 0.00384
## factor(trt)1 -0.4383620 0.6450922 0.0898440 -4.879 1.07e-06
##
## Likelihood ratio test=199.2 on 10 df, p=< 2.2e-16
## n= 2139, number of events= 521
set.seed(123)
X <- as.matrix(aids %>% dplyr::select(-c(cid, time)))
survtime <- aids$time
delta <- aids$cid
fit <- cv.ncvsurv(X, Surv(survtime, delta), penalty = 'lasso',
nlambda = 10, nfolds = 10)
fit.lasso <- ncvsurv(X, Surv(survtime, delta), penalty = 'lasso',
lambda = fit$lambda.min)
# coef(fit.lasso)
# only keep/select the coef(fit.lasso) with non-zero coefficients
coef(fit.lasso)[coef(fit.lasso) != 0]
## trt age wtkg drugs karnof
## -0.4191515317 0.0110270068 0.0014611220 -0.3924863347 -0.0208918143
## oprior z30 preanti race gender
## 0.1498586663 0.2919157343 0.0002205492 -0.0008918510 0.0664088885
## str2 symptom offtrt cd80 cd820
## 0.0160700914 0.5180644577 0.7548336170 0.0005003386 -0.0003711303
names(coef(fit.lasso)[coef(fit.lasso) != 0])
## [1] "trt" "age" "wtkg" "drugs" "karnof" "oprior" "z30"
## [8] "preanti" "race" "gender" "str2" "symptom" "offtrt" "cd80"
## [15] "cd820"
lambda <- 10
dataFit <- createCyclopsData(Surv(survtime, delta) ~ X, modelType = "cox")
barPrior <- createBarPrior(penalty = log(lambda),
initialRidgeVariance = 1/log(lambda))
# barPrior <- createBarPrior(penalty="bic")
fit.bar <- fitCyclopsModel(dataFit, prior = barPrior)
# coef(fit.bar)
# only keep/select the coef(fit.bar) with non-zero coefficients
coef(fit.bar)[coef(fit.bar) != 0]
## Xtrt Xkarnof Xpreanti Xsymptom Xofftrt
## -0.3686930786 -0.0166092513 0.0004120094 0.5135137517 0.6983361593
fit <- cv.ncvsurv(X, Surv(survtime, delta), penalty = 'SCAD',
nlambda = 10, nfolds = 50)
fit.scad <- ncvsurv(X, Surv(survtime, delta), penalty = 'SCAD',
lambda = fit$lambda.min)
# coef(fit.scad)
# only keep/select the coef(fit.scad) with non-zero coefficients
coef(fit.scad)[coef(fit.scad) != 0]
## trt age drugs karnof oprior
## -0.4388024117 0.0108699363 -0.4132929945 -0.0222034021 0.0098459794
## z30 preanti symptom offtrt cd80
## 0.3037341392 0.0002358990 0.5374743537 0.7698823357 0.0005689677
## cd820
## -0.0004436918
fit <- cv.ncvsurv(X, Surv(survtime, delta), penalty = 'MCP',
nlambda = 30, nfolds = 10)
fit.MCP <- ncvsurv(X, Surv(survtime, delta), penalty = 'MCP',
lambda = fit$lambda.min)
# coef(fit.MCP)
# only keep/select the coef(fit.MCP) with non-zero coefficients
coef(fit.MCP)[coef(fit.MCP) != 0]
## trt age wtkg drugs karnof
## -0.4383918255 0.0122983811 0.0003532726 -0.4333006431 -0.0219130789
## oprior z30 preanti gender symptom
## 0.0863840320 0.3114592643 0.0002280159 0.0195643206 0.5345946432
## offtrt cd80 cd820
## 0.7725227913 0.0005668817 -0.0004419579
# List of variables from different selection results
coef_AIC <- c("age", "drugs", "karnof", "z30", "preanti", "symptom",
"offtrt", "cd80", "cd820", "trt")
coef_BIC <- c("age", "drugs", "karnof", "z30", "preanti", "symptom",
"offtrt", "cd80", "cd820", "trt")
# Extract nonzero coefficient names from each model
coef_lasso <- names(coef(fit.lasso))[coef(fit.lasso) != 0]
coef_scad <- names(coef(fit.scad))[coef(fit.scad) != 0]
coef_mcp <- names(coef(fit.MCP))[coef(fit.MCP) != 0]
# Find intersection of selected variables across models
common_variables <- Reduce(intersect, list(coef_lasso, coef_scad, coef_mcp,
coef_AIC, coef_BIC))
common_variables
## [1] "trt" "age" "drugs" "karnof" "z30" "preanti" "symptom"
## [8] "offtrt" "cd80" "cd820"
The common variables selected by all methods are: trt,
age, drugs, karnof,
z30, preanti, symptom,
offtrt, cd80, cd820.
Prepare the data
aids <- aids %>%
mutate(across(c(homo, drugs, oprior, z30, race, gender, str2, symptom, offtrt, trt), as.factor))
# AFT model with exponential distribution
# Fit AFT model with exponential distribution
aids.aft.exponential <- survreg(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "exponential", x=TRUE, y=TRUE)
# AFT model with Weibull distribution
aids.aft.weibull <- survreg(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "weibull")
# AFT model with Gaussian distribution
aids.aft.gaussian <- survreg(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "gaussian")
# AFT model with logistic
aids.aft.logistic <- survreg(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "logistic")
# AFT model with Lognormal
aids.aft.lognormal <- survreg(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "lognormal")
# AFT model with loglogistic
aids.aft.loglogistic <- survreg(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "loglogistic")
# select the model with the highest log likelihood
logLik_values <- c(
exponential = logLik(aids.aft.exponential),
weibull = logLik(aids.aft.weibull),
gaussian = logLik(aids.aft.gaussian),
logistic = logLik(aids.aft.logistic),
lognormal = logLik(aids.aft.lognormal),
loglogistic = logLik(aids.aft.loglogistic)
)
rank(logLik_values)
## exponential weibull gaussian logistic lognormal loglogistic
## 3 4 2 1 6 5
print(round(logLik_values, 2))
## exponential weibull gaussian logistic lognormal loglogistic
## -4698.58 -4639.76 -4713.67 -4739.19 -4622.42 -4629.39
best_model <- names(which.max(logLik_values))
cat("The best AFT model based on log-likelihood is:", best_model)
## The best AFT model based on log-likelihood is: lognormal
#Get Cox-Snell Residuals
par(mfrow = c(3,2), mar = c(4.5, 4.5, 2, 1), oma = c(3, 3, 2, 1),
mgp = c(2.5, 1, 0), las = 1, cex.lab = 1.5, cex.axis = 1.2)
#Logistic
eta <- predict(aids.aft.logistic, type = 'lp')
sigma <- aids.aft.logistic$scale
# Define a range of time points for estimation
time_seq <- seq(min(aids$time), max(aids$time), length.out = 100)
r.log <- -log(1 - plogis((aids$time - eta) / sigma))
fit <- survfit(Surv(r.log, aids$cid) ~ 1)
H.log <- cumsum(fit$n.event / fit$n.risk)
plot(H.log ~ fit$time, type = 'l', main = 'Logistic Regression',
ylab = element_blank(), xlab = element_blank(), font.lab = 2, cex.lab = 1.5)
abline(0, 1, col='red', lty=2)
mtext("(a)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
#Gaussian
sigma <- aids.aft.gaussian$scale
eta <- -aids.aft.gaussian$linear.predictors / sigma
r.g <- -log(1 - pnorm((aids$time - aids.aft.gaussian$linear.predictors) / sigma))
fit <- survfit(Surv(r.g, aids$cid) ~ 1)
H.g <- cumsum(fit$n.event / fit$n.risk)
plot(H.g ~ fit$time, type = 'l', main = 'Gaussian Regression',
ylab = element_blank(), xlab = element_blank())
abline(0, 1, col='red', lty=2)
mtext("(b)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
#Exponential
sigma <- aids.aft.exponential$scale
eta <- -aids.aft.exponential$linear.predictors/sigma
r.exp <- aids$time * exp(eta)
fit <- survfit(Surv(r.exp, aids$cid) ~ 1)
H.exp <- cumsum(fit$n.event / fit$n.risk)
plot(H.exp ~ fit$time, type = 'l', main = 'Exponential Regression',
ylab = 'Estimated Cumulative Hazard', xlab = element_blank(), font.lab = 2, cex.lab = 1.5)
abline(0, 1, col='red', lty=2)
mtext("(c)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
#Weibull
sigma <- aids.aft.weibull$scale
alpha <- 1 / sigma
eta <- -aids.aft.weibull$linear.predictors / sigma
r.wb <- aids$time^alpha * exp(eta)
fit <- survfit(Surv(r.wb, aids$cid) ~ 1)
H.wb <- cumsum(fit$n.event/fit$n.risk)
plot(H.wb ~ fit$time, type = 'l', main = 'Weibull Regression',
ylab = element_blank(), xlab = element_blank(), font.lab = 2, cex.lab = 1.5)
abline(0, 1, col='red', lty=2)
mtext("(d)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
#Log-Logistic
sigma <- aids.aft.loglogistic$scale
alpha <- 1 / sigma
eta <- -aids.aft.loglogistic$linear.predictors / sigma
r.ll <- -log(1/(1 + aids$time^alpha*exp(eta)))
fit <- survfit(Surv(r.ll, aids$cid) ~ 1)
H.ll <- cumsum(fit$n.event / fit$n.risk)
plot(H.ll ~ fit$time, type = 'l', main = 'Log-Logistic Regression',
ylab = element_blank(), xlab = 'Cox-Snell Residual', font.lab = 2, cex.lab = 1.5)
abline(0, 1, col='red', lty=2)
mtext("(e)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
#Log-Normal Distribution
eta <- -aids.aft.lognormal$linear.predictors / aids.aft.lognormal$scale
r.ln <- -log(1 - pnorm((log(aids$time) - aids.aft.lognormal$linear.predictors) / aids.aft.lognormal$scale))
fit <- survfit(Surv(r.ln, aids$cid) ~ 1)
H.ln <- cumsum(fit$n.event/fit$n.risk)
plot(H.ln ~ fit$time, type = 'l', main = 'Log-Normal Regression',
ylab = element_blank(), xlab = 'Cox-Snell Residual', font.lab = 2, cex.lab = 1.5)
abline(0, 1, col='red', lty = 2)
mtext("(f)", side = 3, line = 0.5, adj = 0, cex = 1.2, font = 2)
# Extract AIC for each model
aic_values <- c(
exponential = AIC(aids.aft.exponential),
weibull = AIC(aids.aft.weibull),
gaussian = AIC(aids.aft.gaussian),
logistic = AIC(aids.aft.logistic),
lognormal = AIC(aids.aft.lognormal),
loglogistic = AIC(aids.aft.loglogistic)
)
# Print all AIC values
print(aic_values)
## exponential weibull gaussian logistic lognormal loglogistic
## 9433.156 9317.510 9465.340 9516.372 9282.835 9296.777
# Identify the model with the lowest AIC
best_aic_model <- names(which.min(aic_values))
best_aic_value <- min(aic_values)
cat("The best AFT model based on AIC is:", best_aic_model, "with an AIC of", best_aic_value, "\n")
## The best AFT model based on AIC is: lognormal with an AIC of 9282.835
Therefore, the best AFT model is the Lognormal
distribution based on the log-likelihood, Harrell’s C statistic, Uno’s
C, and AIC.
The AFT model assumes a log-linear relationship between survival time and covariates:
\[ log(T) = \mathbf{X} \mathbf{\beta} + \epsilon \]
Plot Log(Survival Time) vs. Continuous Covariates
library(ggplot2)
library(dplyr)
aids %>%
mutate(log_time = log(time)) %>%
pivot_longer(cols = c(age, wtkg, cd80, cd820), names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Value, y = log_time)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
facet_wrap(~Variable, scales = "free_x") +
theme_minimal() +
labs(title = "Checking Log-Linearity: Log(Time) vs. Covariates",
x = "Covariate Value",
y = "Log(Survival Time)")
In an Accelerated Failure Time (AFT) model with a lognormal distribution, the theoretical residuals (ϵ) follow a normal distribution. So, we can check the normality assumption by plotting the residuals.
In order to check whether the assumed distribution of survival times fits the observed data sufficiently well, we can use the residuals of the model. However, because the residuals are calculated based on the observed event times, they will also be censored. Hence, to investigate whether they follow a particular distribution, we will need to employ a graphical procedure accounting for censoring. One way to achieve this is by using the Kaplan-Meier estimator of the residuals.
The definition of the residuals in the AFT model is given by:
\[ \frac{log(T_i) - \hat{\mu}_i}{\hat{\sigma}} \]
where \(T_i\) is the observed survival time, \(\hat{\mu}_i\) is the predicted survival time, and \(\hat{\sigma}\) is the estimated scale parameter of the AFT model. Therefore, construct the residuals based on their definition:
fitted_values <- aids.aft.lognormal$linear.predictors
resids <- (log(aids.aft.lognormal$y[, 1]) - fitted_values) / aids.aft.lognormal$scale
We then compute the Kaplan-Meier estimate of these residuals, and we plot it.
resKM <- survfit(Surv(resids, cid) ~ 1, data = aids)
plot(resKM, mark.time = FALSE, xlab = "AFT Residuals", ylab = "Survival Probability")
xx <- seq(min(resids), max(resids), length.out = 35)
yy <- 1-pnorm(xx)
lines(xx, yy, col = "red", lwd = 2)
legend("bottomleft", c("KM estimate", "95% CI for KM estimate",
"Survival function of standard normal"),
lty = c(1,2,1), col = c(1,1,2), bty = "n")
Explore whether lognormal AFT model has interactions between treatment and gender/sexual orientation
aids.aft.lognormal.interactions <- survreg(Surv(time, cid) ~ age + wtkg +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt) * factor(homo) + factor(trt) * factor(gender),
data = aids, dist = "lognormal")
summary(aids.aft.lognormal.interactions)
##
## Call:
## survreg(formula = Surv(time, cid) ~ age + wtkg + factor(drugs) +
## karnof + factor(oprior) + factor(z30) + preanti + factor(race) +
## factor(str2) + strat + factor(symptom) + factor(offtrt) +
## cd80 + cd820 + factor(trt) * factor(homo) + factor(trt) *
## factor(gender), data = aids, dist = "lognormal")
## Value Std. Error z p
## (Intercept) 6.65e+00 5.62e-01 11.82 < 2e-16
## age -6.03e-03 3.55e-03 -1.70 0.08933
## wtkg -5.14e-04 2.40e-03 -0.21 0.83036
## factor(drugs)1 2.66e-01 9.87e-02 2.70 0.00695
## karnof 1.68e-02 5.09e-03 3.31 0.00092
## factor(oprior)1 -2.81e-01 2.01e-01 -1.40 0.16279
## factor(z30)1 -2.20e-01 1.54e-01 -1.43 0.15242
## preanti -1.95e-04 1.15e-04 -1.69 0.09096
## factor(race)1 5.23e-03 7.36e-02 0.07 0.94335
## factor(str2)1 -5.26e-02 2.06e-01 -0.26 0.79864
## strat 5.19e-02 1.15e-01 0.45 0.65059
## factor(symptom)1 -3.99e-01 7.61e-02 -5.25 1.6e-07
## factor(offtrt)1 -5.68e-01 6.36e-02 -8.93 < 2e-16
## cd80 -4.30e-04 9.49e-05 -4.53 6.0e-06
## cd820 3.61e-04 1.05e-04 3.42 0.00062
## factor(trt)1 3.40e-01 1.57e-01 2.16 0.03042
## factor(homo)1 7.46e-02 1.13e-01 0.66 0.50753
## factor(gender)1 -1.46e-01 1.42e-01 -1.02 0.30709
## factor(trt)1:factor(homo)1 -1.46e-01 1.67e-01 -0.88 0.38119
## factor(trt)1:factor(gender)1 1.36e-01 2.15e-01 0.63 0.52663
## Log(scale) 4.20e-02 3.52e-02 1.19 0.23283
##
## Scale= 1.04
##
## Log Normal distribution
## Loglik(model)= -4622 Loglik(intercept only)= -4736.4
## Chisq= 228.69 on 19 degrees of freedom, p= 6.2e-38
## Number of Newton-Raphson Iterations: 4
## n= 2139
Conclusion:
The interaction terms are not significant, suggesting that the treatment effect is consistent across gender and sexual orientation groups.
# Variable selection using stepwise AIC
best.model.aic <- stepAIC(aids.aft.lognormal, direction="both",
k=2, trace=0)
summary(best.model.aic)
##
## Call:
## survreg(formula = Surv(time, cid) ~ age + factor(drugs) + karnof +
## factor(oprior) + factor(z30) + preanti + factor(symptom) +
## factor(offtrt) + cd80 + cd820 + factor(trt), data = aids,
## dist = "lognormal")
## Value Std. Error z p
## (Intercept) 6.60e+00 5.19e-01 12.71 < 2e-16
## age -6.30e-03 3.46e-03 -1.82 0.06836
## factor(drugs)1 2.73e-01 9.68e-02 2.82 0.00475
## karnof 1.69e-02 5.06e-03 3.34 0.00083
## factor(oprior)1 -2.72e-01 1.85e-01 -1.47 0.14048
## factor(z30)1 -2.02e-01 8.17e-02 -2.47 0.01339
## preanti -1.63e-04 8.16e-05 -1.99 0.04637
## factor(symptom)1 -4.01e-01 7.54e-02 -5.32 1.1e-07
## factor(offtrt)1 -5.70e-01 6.34e-02 -8.99 < 2e-16
## cd80 -4.32e-04 9.46e-05 -4.57 4.8e-06
## cd820 3.59e-04 1.05e-04 3.41 0.00065
## factor(trt)1 3.53e-01 6.16e-02 5.73 1.0e-08
## Log(scale) 4.30e-02 3.52e-02 1.22 0.22215
##
## Scale= 1.04
##
## Log Normal distribution
## Loglik(model)= -4623 Loglik(intercept only)= -4736.4
## Chisq= 226.64 on 11 degrees of freedom, p= 2.1e-42
## Number of Newton-Raphson Iterations: 4
## n= 2139
n <- nrow(aids)
best.model.bic <- stepAIC(aids.aft.lognormal, direction="both",
k=log(n), trace=0)
best.model.bic
## Call:
## survreg(formula = Surv(time, cid) ~ karnof + preanti + factor(symptom) +
## factor(offtrt) + cd80 + cd820 + factor(trt), data = aids,
## dist = "lognormal")
##
## Coefficients:
## (Intercept) karnof preanti factor(symptom)1
## 6.2159930381 0.0181270251 -0.0003154474 -0.4073723504
## factor(offtrt)1 cd80 cd820 factor(trt)1
## -0.5514171184 -0.0004352931 0.0003698285 0.3548002879
##
## Scale= 1.048921
##
## Loglik(model)= -4632.2 Loglik(intercept only)= -4736.4
## Chisq= 208.4 on 7 degrees of freedom, p= <2e-16
## n= 2139
# Load required packages
library(survival)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(dplyr)
# Step 1: Prepare Data - Convert categorical variables into dummy variables
aids_processed <- model.matrix(Surv(time, cid) ~ age + wtkg + factor(homo) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(race) + factor(gender) +
factor(str2) + strat + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids)[,-1] # Remove intercept column
# Response variable (log of survival time, as AFT assumes log-linear form)
y <- log(aids$time)
# Censoring indicator
censoring <- aids$cid
# Step 2: Fit LASSO model
lasso_fit <- glmnet(aids_processed, y, family = "gaussian", alpha = 1) # LASSO (alpha=1)
# Step 3: Cross-validation to select optimal lambda (penalty parameter)
cv_lasso <- cv.glmnet(aids_processed, y, family = "gaussian", alpha = 1)
# Step 4: Identify optimal lambda
best_lambda <- cv_lasso$lambda.min
# Step 5: Extract selected variables (non-zero coefficients)
lasso_coefficients <- coef(cv_lasso, s = best_lambda) # Get coefficients at optimal lambda
selected_variables <- rownames(lasso_coefficients)[lasso_coefficients[,1] != 0] # Remove zero coefficients
# Display selected variables
print(selected_variables)
## [1] "(Intercept)" "factor(homo)1" "factor(drugs)1" "karnof"
## [5] "factor(oprior)1" "factor(z30)1" "preanti" "factor(race)1"
## [9] "factor(symptom)1" "factor(offtrt)1" "cd80" "cd820"
## [13] "factor(trt)1"
The common variables selected by all methods are: trt,
age, drugs, karnof,
oprior, z30, preanti,
symptom, offtrt, cd80,
cd820.
best.aft.model <- survreg(Surv(time, cid) ~ age + factor(homo) + factor(race) +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "lognormal")
summary(best.aft.model)
##
## Call:
## survreg(formula = Surv(time, cid) ~ age + factor(homo) + factor(race) +
## factor(drugs) + karnof + factor(oprior) + factor(z30) + preanti +
## factor(symptom) + factor(offtrt) + cd80 + cd820 + factor(trt),
## data = aids, dist = "lognormal")
## Value Std. Error z p
## (Intercept) 6.62e+00 5.24e-01 12.64 < 2e-16
## age -5.95e-03 3.51e-03 -1.69 0.09032
## factor(homo)1 -3.24e-02 7.12e-02 -0.46 0.64895
## factor(race)1 1.47e-02 7.27e-02 0.20 0.84025
## factor(drugs)1 2.64e-01 9.85e-02 2.68 0.00743
## karnof 1.67e-02 5.07e-03 3.30 0.00098
## factor(oprior)1 -2.72e-01 1.85e-01 -1.47 0.14083
## factor(z30)1 -2.05e-01 8.19e-02 -2.50 0.01240
## preanti -1.60e-04 8.20e-05 -1.95 0.05144
## factor(symptom)1 -3.95e-01 7.59e-02 -5.21 1.9e-07
## factor(offtrt)1 -5.71e-01 6.35e-02 -8.99 < 2e-16
## cd80 -4.31e-04 9.47e-05 -4.55 5.4e-06
## cd820 3.60e-04 1.05e-04 3.41 0.00064
## factor(trt)1 3.54e-01 6.17e-02 5.74 9.5e-09
## Log(scale) 4.31e-02 3.52e-02 1.22 0.22064
##
## Scale= 1.04
##
## Log Normal distribution
## Loglik(model)= -4622.9 Loglik(intercept only)= -4736.4
## Chisq= 226.97 on 13 degrees of freedom, p= 3.8e-41
## Number of Newton-Raphson Iterations: 4
## n= 2139
round(best.aft.model$coefficients, 3)
## (Intercept) age factor(homo)1 factor(race)1
## 6.618 -0.006 -0.032 0.015
## factor(drugs)1 karnof factor(oprior)1 factor(z30)1
## 0.264 0.017 -0.272 -0.205
## preanti factor(symptom)1 factor(offtrt)1 cd80
## 0.000 -0.395 -0.571 0.000
## cd820 factor(trt)1
## 0.000 0.354
summary(best.aft.model)$table[, "Std. Error"]
## (Intercept) age factor(homo)1 factor(race)1
## 5.236486e-01 3.514783e-03 7.117672e-02 7.273944e-02
## factor(drugs)1 karnof factor(oprior)1 factor(z30)1
## 9.851679e-02 5.073129e-03 1.849545e-01 8.187272e-02
## preanti factor(symptom)1 factor(offtrt)1 cd80
## 8.198066e-05 7.587983e-02 6.351147e-02 9.472054e-05
## cd820 factor(trt)1 Log(scale)
## 1.054477e-04 6.165573e-02 3.520680e-02
round(exp(best.aft.model$coefficients), 3)
## (Intercept) age factor(homo)1 factor(race)1
## 748.362 0.994 0.968 1.015
## factor(drugs)1 karnof factor(oprior)1 factor(z30)1
## 1.302 1.017 0.762 0.815
## preanti factor(symptom)1 factor(offtrt)1 cd80
## 1.000 0.673 0.565 1.000
## cd820 factor(trt)1
## 1.000 1.425
round(exp(confint(best.aft.model)), 3)
## 2.5 % 97.5 %
## (Intercept) 268.152 2088.537
## age 0.987 1.001
## factor(homo)1 0.842 1.113
## factor(race)1 0.880 1.170
## factor(drugs)1 1.073 1.579
## karnof 1.007 1.027
## factor(oprior)1 0.530 1.094
## factor(z30)1 0.694 0.957
## preanti 1.000 1.000
## factor(symptom)1 0.580 0.781
## factor(offtrt)1 0.499 0.640
## cd80 0.999 1.000
## cd820 1.000 1.001
## factor(trt)1 1.262 1.608
best.aft.model |>
tbl_regression(intercept = TRUE,
digits = list("estimate" ~ 3))
## Warning: The `exponentiate` argument is not supported in the `tidy()` method
## for `survreg` objects and will be ignored.
| Characteristic | Beta | 95% CI1 | p-value |
|---|---|---|---|
| (Intercept) | 6.6 | 5.6, 7.6 | <0.001 |
| age | -0.01 | -0.01, 0.00 | 0.090 |
| factor(homo) | |||
| 0 | — | — | |
| 1 | -0.03 | -0.17, 0.11 | 0.6 |
| factor(race) | |||
| 0 | — | — | |
| 1 | 0.01 | -0.13, 0.16 | 0.8 |
| factor(drugs) | |||
| 0 | — | — | |
| 1 | 0.26 | 0.07, 0.46 | 0.007 |
| karnof | 0.02 | 0.01, 0.03 | <0.001 |
| factor(oprior) | |||
| 0 | — | — | |
| 1 | -0.27 | -0.63, 0.09 | 0.14 |
| factor(z30) | |||
| 0 | — | — | |
| 1 | -0.20 | -0.37, -0.04 | 0.012 |
| preanti | 0.00 | 0.00, 0.00 | 0.051 |
| factor(symptom) | |||
| 0 | — | — | |
| 1 | -0.40 | -0.54, -0.25 | <0.001 |
| factor(offtrt) | |||
| 0 | — | — | |
| 1 | -0.57 | -0.70, -0.45 | <0.001 |
| cd80 | 0.00 | 0.00, 0.00 | <0.001 |
| cd820 | 0.00 | 0.00, 0.00 | <0.001 |
| factor(trt) | |||
| 0 | — | — | |
| 1 | 0.35 | 0.23, 0.47 | <0.001 |
| 1 CI = Confidence Interval | |||
The common variables selected by all methods are: trt, age, drugs, karnof, oprior, z30, preanti, symptom, offtrt, cd80, cd820.
Cox’s
aids.coxph <- coxph(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, ties = 'breslow', x=TRUE, y=TRUE)
Exponential
# AFT model with exponential distribution
# Fit AFT model with exponential distribution
aids.aft.exponential <- survreg(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "exponential", x=TRUE, y=TRUE)
Weibull
# AFT model with Weibull distribution
aids.aft.weibull <- survreg(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "weibull", x=TRUE, y=TRUE)
Gaussian
# AFT model with Gaussian distribution
aids.aft.gaussian <- survreg(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "gaussian", x=TRUE, y=TRUE)
Logistic
# AFT model with logistic
aids.aft.logistic <- survreg(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "logistic", x=TRUE, y=TRUE)
Lognormal
# AFT model with Lognormal
aids.aft.lognormal <- survreg(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "lognormal", x=TRUE, y=TRUE)
Loglogistic
# AFT model with loglogistic
aids.aft.loglogistic <- survreg(Surv(time, cid) ~ age +
factor(drugs) + karnof + factor(oprior) + factor(z30) +
preanti + factor(symptom) +
factor(offtrt) + cd80 + cd820 +
factor(trt),
data = aids, dist = "loglogistic", x=TRUE, y=TRUE)
# Psedo R2 for each model
PseudoR2 <- c(
coxph = pam.coxph(aids.coxph)$R.squared,
exponential = pam.coxph(aids.aft.exponential)$R.squared,
weibull = pam.coxph(aids.aft.weibull)$R.squared,
gaussian = pam.coxph(aids.aft.gaussian)$R.squared,
logistic = pam.coxph(aids.aft.logistic)$R.squared,
lognormal = pam.coxph(aids.aft.lognormal)$R.squared,
loglogistic = pam.coxph(aids.aft.loglogistic)$R.squared
)
## Warning in weight.km * (calibrate.fitted - sum(weight.km * y))^2: longer object
## length is not a multiple of shorter object length
## Warning in y - calibrate.fitted: longer object length is not a multiple of
## shorter object length
## Warning in weight.km * (calibrate.fitted - sum(weight.km * y))^2: longer object
## length is not a multiple of shorter object length
## Warning in y - calibrate.fitted: longer object length is not a multiple of
## shorter object length
# Print all Pseudo R2 values
print(PseudoR2)
## coxph exponential weibull gaussian logistic lognormal
## "0.0909" "0.0725" "0.0873" "NA" "NA" "0.0885"
## loglogistic
## "0.0883"
# Identify the model with the lowest Pseudo R2
best_pseudoR2_model <- names(which.min(PseudoR2))
## Warning in which.min(PseudoR2): NAs introduced by coercion
best_pseudoR2_value <- min(PseudoR2)
cat("The model with the lowest Pseudo R2 is", best_pseudoR2_model, "with a value of", best_pseudoR2_value, "\n")
## The model with the lowest Pseudo R2 is exponential with a value of 0.0725
# L_squared for each model
L_squared <- c(
coxph = pam.coxph(aids.coxph)$L.squared,
exponential = pam.coxph(aids.aft.exponential)$L.squared,
weibull = pam.coxph(aids.aft.weibull)$L.squared,
gaussian = pam.coxph(aids.aft.gaussian)$L.squared,
logistic = pam.coxph(aids.aft.logistic)$L.squared,
lognormal = pam.coxph(aids.aft.lognormal)$L.squared,
loglogistic = pam.coxph(aids.aft.loglogistic)$L.squared
)
## Warning in weight.km * (calibrate.fitted - sum(weight.km * y))^2: longer object
## length is not a multiple of shorter object length
## Warning in y - calibrate.fitted: longer object length is not a multiple of
## shorter object length
## Warning in weight.km * (calibrate.fitted - sum(weight.km * y))^2: longer object
## length is not a multiple of shorter object length
## Warning in y - calibrate.fitted: longer object length is not a multiple of
## shorter object length
# Print all L_squared values
print(L_squared)
## coxph exponential weibull gaussian logistic lognormal
## "0.2840" "0.1996" "0.2065" "NA" "NA" "0.2025"
## loglogistic
## "0.2041"
# Identify the model with the lowest L_squared
best_L_squared_model <- names(which.min(L_squared))
## Warning in which.min(L_squared): NAs introduced by coercion
best_L_squared_value <- min(L_squared)
cat("The model with the lowest L_squared is", best_L_squared_model, "with a value of", best_L_squared_value)
## The model with the lowest L_squared is exponential with a value of 0.1996
# Extract concordance index for each model
concordance_values <- c(
exponential = concordance(aids.aft.exponential)$concordance,
weibull = concordance(aids.aft.weibull)$concordance,
gaussian = concordance(aids.aft.gaussian)$concordance,
logistic = concordance(aids.aft.logistic)$concordance,
lognormal = concordance(aids.aft.lognormal)$concordance,
loglogistic = concordance(aids.aft.loglogistic)$concordance
)
# Print all concordance values
print(round(concordance_values, 4))
## exponential weibull gaussian logistic lognormal loglogistic
## 0.6758 0.6760 0.6770 0.6765 0.6773 0.6769
print(rank(concordance_values))
## exponential weibull gaussian logistic lognormal loglogistic
## 1 2 5 3 6 4
# Identify the model with the highest concordance
best_concordance_model <- names(which.max(concordance_values))
best_concordance_value <- max(concordance_values)
cat("The best AFT model based on concordance index is:", best_concordance_model, "with a C-index of", best_concordance_value, "\n")
## The best AFT model based on concordance index is: lognormal with a C-index of 0.6773082
# Extract Uno's C index for each model
uno_values <- c(
exponential = concordance(aids.aft.exponential, timewt="n/G2")$concordance,
weibull = concordance(aids.aft.weibull, timewt="n/G2")$concordance,
gaussian = concordance(aids.aft.gaussian, timewt="n/G2")$concordance,
logistic = concordance(aids.aft.logistic, timewt="n/G2")$concordance,
lognormal = concordance(aids.aft.lognormal, timewt="n/G2")$concordance,
loglogistic = concordance(aids.aft.loglogistic, timewt="n/G2")$concordance
)
# Print all Uno's C values
print(round(uno_values, 4))
## exponential weibull gaussian logistic lognormal loglogistic
## 0.6661 0.6662 0.6669 0.6666 0.6675 0.6671
print(rank(uno_values))
## exponential weibull gaussian logistic lognormal loglogistic
## 1 2 4 3 6 5
# Identify the model with the highest Uno's C
best_uno_model <- names(which.max(uno_values))
best_uno_value <- max(uno_values)
cat("The best AFT model based on Uno's C index is:", best_uno_model, "with a C-index of", best_uno_value, "\n")
## The best AFT model based on Uno's C index is: lognormal with a C-index of 0.6674623